The purpose of the workshop is to give you a sense of how to use R in scientific research, along with best practices of open science. Here are the major topics we will go through:
Say, we want to make an image about global eddy-covariance flux tower sites. How would you do it?
# Read in a data.table with FLUXNET2015 site metadata
# (site IDs, names, coordinates, IGBP land-cover class, climate variables)
sites <- fread("data/sites_fluxnet2015.csv")
head(sites)
## siteID site_name site_desc country province_state
## <char> <char> <char> <char> <char>
## 1: AR-SLu San Luis mixed forest Argentina <NA>
## 2: AR-SLu San Luis mixed forest Argentina <NA>
## 3: AR-Vir Virasoro evergreen needleleaf forest Argentina <NA>
## 4: AR-Vir Virasoro evergreen needleleaf forest Argentina <NA>
## 5: AT-Neu Neustift grassland with wetland Austria <NA>
## 6: AT-Neu Neustift grassland with wetland Austria <NA>
## data_tier2015 data_start2015 data_end2015 lat lon elev_m IGBP
## <int> <int> <int> <num> <num> <num> <char>
## 1: 1 2009 2011 -33.46480 -66.4598 NA MF
## 2: 2 2009 2011 -33.46480 -66.4598 NA MF
## 3: 1 2009 2012 -28.23950 -56.1886 NA ENF
## 4: 2 2009 2012 -28.23950 -56.1886 NA ENF
## 5: 1 2002 2012 47.11667 11.3175 970 GRA
## 6: 2 2002 2012 47.11667 11.3175 970 GRA
## MAT MAP
## <num> <num>
## 1: NA 400
## 2: NA 400
## 3: NA NA
## 4: NA NA
## 5: 6.3 852
## 6: 6.3 852
Make a terra::vect object to store the coordinates in a
geospatial format:
# Convert the site table into a terra point vector; EPSG:4326 is the
# WGS84 longitude/latitude coordinate reference system.
sites_vec <- vect(
  sites,
  geom = c("lon", "lat"),
  crs = "epsg:4326"
)
plot(sites_vec)
Load the internal “World” dataset:
# Load the "World" country polygons dataset shipped with tmap
data("World")
# Plot the second attribute column; an empty `main` suppresses the title
plot(World[2], main = "")
Download map tiles as our background image:
# Download background map tiles; change `provider` to see different styles
tile <- maptiles::get_tiles(World, zoom = 2, provider = "OpenStreetMap")

# Base layer: draw the tiles as an RGB image, without a surrounding frame
flux_map <- tm_shape(tile) +
  tm_rgb() +
  tm_layout(frame = FALSE)
flux_map
Now, we add the World data onto this map:
# Overlay country borders from the World dataset on top of the tiles
flux_map <- flux_map +
tm_shape(World) +
tm_borders()
flux_map
Add flux tower site locations as points:
# Add tower locations as dots, colored by IGBP land-cover class;
# tmap needs an sf object, so convert the terra vector first
flux_map <- flux_map +
tm_shape(sf::st_as_sf(sites_vec)) +
tm_dots(col = "IGBP", palette = hcl.colors(12, "dynamic"),
shape = 21, size = 0.1, border.col = "white"
) +
tm_layout(legend.outside = TRUE, legend.outside.size = 0.1)
flux_map
Note that the flux tower sites in Europe are a bit too dense to see, can we make them more visible? (Hint: we have a lot of empty space on this map)
# Define the Europe boundary; note that `terra::ext()` takes the order
# `xmin, xmax, ymin, ymax`
eu_bbox <- ext(-10, 38, 33, 63)
# Zoom in the map, note that the order of `bbox` is `xmin, ymin, xmax, ymax`
eu_map <- tm_shape(tile, bbox = eu_bbox[c(1, 3, 2, 4)]) +
tm_rgb() +
tm_layout(frame = FALSE) +
tm_shape(World) +
# Transparent fill so only the country outlines show over the tiles
tm_polygons(alpha = 0) +
tm_shape(sf::st_as_sf(sites_vec)) +
tm_dots(
col = "IGBP", palette = hcl.colors(12, "dynamic"),
shape = 21, size = 0.1, border.col = "white",
# Suppress the legend here; the main map already has one
legend.show = FALSE
)
eu_map
# Construct a boundary polygon from the Europe extent
eu_bbox_vect <- as.polygons(eu_bbox, crs = "epsg:4326")
# Draw the polygon on the map canvas to mark the zoomed-in region
flux_map <- flux_map + tm_shape(sf::st_as_sf(eu_bbox_vect)) +
tm_polygons(alpha = 0, lwd = 2, border.col = "white")
print(flux_map)
# Draw the Europe map as an inset in the lower-left empty space,
# using a grid viewport (coordinates are fractions of the device)
print(eu_map, vp = grid::viewport(x = 0.05, y = 0.2,
just = c("left", "bottom"),
width = 0.13, height = 0.2)
)
What if we want to make this map interactive? One great thing about
tmap is that our code transfers directly to interactive
applications such as a web-based map.
# Switch tmap to interactive ("view") mode; printing the same map object
# now renders a zoomable web map instead of a static figure
tmap_mode("view")
## tmap mode set to interactive viewing
flux_map
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1; sf_use_s2() is TRUE
Bonus: let’s make a map to show Harvard Forest EMS flux tower!
# Find the Harvard Forest EMS tower (site ID US-Ha1)
hf_site <- sites_vec[sites_vec$siteID == "US-Ha1", ]
# Make a 3000 m buffer around the tower point
# NOTE(review): for lon/lat data, terra::buffer() interprets `width` in
# meters — confirm against the terra documentation for your version
hf_site_buf <- buffer(hf_site, width = 3000)
plot(hf_site, xlim = c(-72.3, -72.05), ylim = c(42.45, 42.6))
plot(hf_site_buf, add = TRUE)
# High-zoom satellite imagery tiles covering the buffer extent
tile <- maptiles::get_tiles(ext(hf_site_buf),
zoom = 15,
crop = TRUE,
provider = "Esri.WorldImagery"
)
plot(tile)
# Compose the tower map: imagery background, tower marker, label,
# scale bar, and compass
hf_tower <- tm_shape(tile) +
tm_rgb() +
tm_shape(st_as_sf(hf_site)) +
# shape 24 = filled triangle marking the tower location
tm_dots(col = "red", size = 0.5, shape = 24,
border.col = "white", border.lwd = 2
) +
# Label the site; negative `ymod` shifts the text below the marker
tm_text(text = "site_name", col = "white", ymod = -1) +
tm_scale_bar(position = c("right", "bottom"), width = 0.3,
text.size = 1, text.color = "white"
) +
tm_compass(position = c("right", "top"),
text.color = "white", size = 3, type = "4star"
) +
tm_layout(frame = FALSE)
hf_tower
## Compass not supported in view mode.
## stars object downsampled to 1163 by 860 cells. See tm_shape manual (argument raster.downsample)
## Scale bar width set to 100 pixels
## Symbol shapes other than circles or icons are not supported in view mode.
# Switch back to static ("plot") mode for the remaining figures
tmap_mode("plot")
## tmap mode set to plotting
print(hf_tower)
## stars object downsampled to 1163 by 860 cells. See tm_shape manual (argument raster.downsample)
In this section, we are going to investigate the relationship between plants’ growing season length and annual carbon sequestration, using satellite remote sensing data.
The satellite-based greenness measurements are stored in the
data/us-ha1_evi2.csv.
# Two-band Enhanced Vegetation Index (EVI2) time series at US-Ha1
evi2_dt <- fread("data/us-ha1_evi2.csv")
# Keep observations from 1992 onward (`year(Date) > 1991`)
evi2_dt <- evi2_dt[year(Date) > 1991]
head(evi2_dt)
## Category ID Latitude Longitude Date evi2 qa fill snow
## <lgcl> <char> <num> <num> <IDat> <num> <int> <lgcl> <lgcl>
## 1: NA US-Ha1 42.5378 -72.1715 1992-01-08 0.1400310 5440 FALSE FALSE
## 2: NA US-Ha1 42.5378 -72.1715 1992-02-02 0.1651088 5440 FALSE FALSE
## 3: NA US-Ha1 42.5378 -72.1715 1992-02-18 0.1252856 5440 FALSE FALSE
## 4: NA US-Ha1 42.5378 -72.1715 1992-03-21 0.1609408 5440 FALSE FALSE
## 5: NA US-Ha1 42.5378 -72.1715 1992-04-06 0.1996971 5440 FALSE FALSE
## 6: NA US-Ha1 42.5378 -72.1715 1992-04-13 0.1891112 5440 FALSE FALSE
## cloud cloudShadow clear sensor
## <lgcl> <lgcl> <int> <char>
## 1: FALSE FALSE 1 L5
## 2: FALSE FALSE 1 L5
## 3: FALSE FALSE 1 L5
## 4: FALSE FALSE 1 L5
## 5: FALSE FALSE 1 L5
## 6: FALSE FALSE 1 L5
Let’s plot these values as a point time series:
# One color per satellite sensor
cols <- RColorBrewer::brewer.pal(8, "Set2")

# Set up an empty canvas with axes only (`type = "n"` is the idiomatic
# way to suppress points, rather than the `cex = 0` trick), then add each
# sensor's observations in its own color.
plot(evi2_dt[, .(Date, evi2)],
ylim = c(0, 1),
xlab = "Date", ylab = "EVI2",
mgp = c(2, 0.5, 0),
bty = "L", las = 1,
type = "n"
)
sensor_names <- c("L5", "L7", "L8", "L9", "HLSS30", "HLSL30")
for (i in seq_along(sensor_names)) {
points(evi2_dt[sensor == sensor_names[i], .(Date, evi2)],
col = cols[i],
pch = 16,
cex = 0.5
)
}
# Position the legend in normalized device coordinates so it sits above
# the plot region; `xpd = NA` permits drawing outside the plot area
legend(grconvertX(0.5, "ndc"), grconvertY(0.9, "ndc"),
xjust = 0.5,
legend = sensor_names,
pch = 16, col = cols, bty = "n",
pt.cex = 0.5, xpd = NA,
horiz = TRUE
)
To retrieve phenology from this time series, we need to use a curve
fitting algorithm to interpolate the gaps between observations. Here, we
use the blsp
package from GitHub to perform this task. Check the website for
information about the package.
# Install the blsp package from GitHub; skip vignette building to save time
devtools::install_github("ncsuSEAL/Bayesian_LSP", build_vignettes = FALSE)
No need to understand the details of the following code chunk except that it will give us phenological dates from the satellite observations. It will take some time for the model to run.
library(blsp)
# Fit an average seasonal-trajectory model first; its estimates are used
# as initial values for the Bayesian fit below
avgfit <- FitAvgModel(
date_vec = evi2_dt$Date,
vi_vec = evi2_dt$evi2,
model = "dblog7"
)
# Bayesian land surface phenology fit (MCMC sampling; can take a while)
# NOTE(review): `start_yr = 1991` although the data were filtered to
# years > 1991 above — confirm this is intentional
blsp_fit <- FitBLSP(
date_vec = evi2_dt$Date,
vi_vec = evi2_dt$evi2,
model = "dblog7",
init_values = avgfit,
start_yr = 1991,
end_yr = 2023,
cred_int_level = 0.95,
opt = list(method = "threshold"),
verbose = TRUE
)
## Initialize model...
## Sampling (could have multiple chains)...
## total interation times:25000
## Estimate phenometrics...
## Done!
# Visualize the fitted curves with their credible intervals
PlotBLSP(blsp_fit)
# Growing season length (GSL) = dormancy date minus green-up date, per year
phenos <- blsp_fit$phenos
gsl_dt <- phenos[, .(year = as.integer(Year), GSL = Dormancy - Greenup)]
plot(gsl_dt[, .(year, GSL)], type = "b")
Now, we load annual gross primary productivity (GPP) data measured by the EMS flux tower.
# Annual gross primary productivity (GPP) measured by the EMS flux tower
gpp_dt <- fread("data/us-ha1_annual_gpp.csv")
# Stack the GSL and GPP time series in a 2-row layout; keep the old
# graphics settings so later single-panel plots are not drawn half-height
old_par <- par(mfrow = c(2, 1))
plot(gsl_dt[, .(year, GSL)], type = "b")
plot(gpp_dt[, .(year, GPP)], type = "b",
# Plotmath `mu` renders a proper Greek micro sign; the literal `umol`
# in the original would print as the word "umol"
ylab = expression(GPP ~ (mu * mol / m^2 / year))
)
# Restore the previous panel layout
par(old_par)
Now, we will fit a linear regression model to investigate the relationship between annual GPP and growing season length.
# Merge the data together on the shared `year` column; a single `by` is
# clearer than `by.x`/`by.y` when the key has the same name in both tables
com_dt <- merge(gsl_dt, gpp_dt, by = "year")
head(com_dt)
## Key: <year>
## year GSL GPP
## <int> <num> <num>
## 1: 1992 158 1170.390
## 2: 1993 184 1359.253
## 3: 1994 171 1236.232
## 4: 1995 172 1254.364
## 5: 1996 178 1327.397
## 6: 1997 166 1402.479
Fitting a linear regression in R is very simple, just use the
lm() function, and the summary() function can
give us the statistics of the model fit:
# Ordinary least squares: annual GPP as a function of growing season length
mod <- lm(GPP ~ GSL, data = com_dt)
# Coefficients, R-squared, and the F-test of the overall fit
summary(mod)
##
## Call:
## lm(formula = GPP ~ GSL, data = com_dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -338.3 -170.2 5.4 120.5 484.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -230.458 683.166 -0.337 0.7382
## GSL 9.898 3.886 2.547 0.0162 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 195 on 30 degrees of freedom
## Multiple R-squared: 0.1778, Adjusted R-squared: 0.1504
## F-statistic: 6.487 on 1 and 30 DF, p-value: 0.01624
# Scatter plot of annual GPP against growing season length
plot(com_dt[, .(GSL, GPP)], pch = 16, bty = "L")
# Draw the linear regression line
abline(mod, col = "blue")
# Annotate the R^2 value using the legend function; use the full element
# name `r.squared` — `$r.square` only worked via partial matching
r_sqr <- round(summary(mod)$r.squared, 2)
legend("topleft", bty = "n", legend = bquote(R^2 == .(r_sqr)))
# Also plot the overall p-value, derived from the F statistic
f <- summary(mod)$fstatistic
p_val <- pf(f[1], f[2], f[3], lower.tail = FALSE)
legend("topleft", bty = "n",
legend = paste("p-value: ", formatC(p_val, format = "e", digits = 2)),
y.intersp = 3 # Pushes this label below the R^2 annotation drawn above
)
Note this is an oversimplified analysis! In the real world, we should look closely at the model fit and the outliers to make sure the mathematical assumptions of linear regression are satisfied.
Reproducibility is CRITICAL in science (https://www.nature.com/articles/d41586-019-00067-3). Getting your code under version control not only facilitates reproducibility but also helps you understand and reuse your code in the future.
# Init the Git repository
git init
# Add README.md to the Git repository
git add README.md
# Add all files to the repository
git add .
# Commit changes
git commit -a -m "Create README"
# Display status (short format)
git status -s
# Check history of commits
git log
# Create and check out a new branch (`-b` creates it)
git checkout -b newbranch
Write `a <- 1`, not `a<-1` (put spaces around the assignment operator).